{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload edited modules automatically before each cell runs\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "# Display the value of every top-level expression in a cell (the default is 'last_expr')\n",
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = 'all'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "# local checkout of this repo (CameraTraps)\n",
    "repo_root = '/Users/siyuyang/Source/repos/GitHub_MSFT/CameraTraps'\n",
    "\n",
    "# append this repo to PYTHONPATH; the guard keeps re-runs of this cell from adding duplicates\n",
    "if repo_root not in sys.path:\n",
    "    sys.path.append(repo_root)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from collections import Counter, defaultdict\n",
    "from random import sample\n",
    "import math\n",
    "\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "from data_management.megadb.schema import sequences_schema_check\n",
    "from data_management.annotations.add_bounding_boxes_to_megadb import *\n",
    "from data_management.megadb.converters.cct_to_megadb import make_cct_embedded, process_sequences, write_json"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sul Ross 2019 spring"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "path_to_output = '/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/megadb_2020/sulross_2019_spring_megadb.json'  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Name of the dataset**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name = 'sulross_2019_spring'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 0 - Add an entry to the `datasets` table\n",
    "\n",
    "Done. Prefix is `Spring2019`. Note that for dataset `sulross_2018` there is no prefix in the dataset entry."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1 - Prepare the `sequence` objects to insert into the database"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 1b - If you're starting from scratch..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_folder = '/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Engagements/SulRoss/share_microsoft/LionSpring2019Labels'\n",
    "\n",
    "# some folders were renamed, so using the API output to map to path in blob storage\n",
    "api_output_path = '/Users/siyuyang/Source/temp_data/CameraTrap/engagements/SulRoss/20191015_Spring2019/4092_detections_SulRossSpring2019_20191014211922.json'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(api_output_path) as f:\n",
    "    detection_res = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map each image's base file name to its path with the leading path component removed\n",
    "# (i.e. delete prefix Spring2019). If two entries share a base name, the later one wins.\n",
    "image_name_to_path = {\n",
    "    os.path.basename(entry['file']): entry['file'].partition('/')[2]\n",
    "    for entry in detection_res['images']\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "del detection_res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Read every label CSV in label_folder and stack them into one DataFrame.\n",
    "# Only *.csv entries are read, so stray files in the folder (e.g. .DS_Store) are not passed to\n",
    "# the CSV parser, and the listing is sorted so the row order is the same on every run.\n",
    "csv_files = sorted(name for name in os.listdir(label_folder) if name.lower().endswith('.csv'))\n",
    "\n",
    "li = []\n",
    "for csv_file in csv_files:\n",
    "    print(csv_file)\n",
    "    csv_path = os.path.join(label_folder, csv_file)\n",
    "    li.append(pd.read_csv(csv_path, index_col=None, header=0))\n",
    "\n",
    "timelapse_df = pd.concat(li, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "602438"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "File             object\n",
       "RelativePath    float64\n",
       "Folder           object\n",
       "Date             object\n",
       "Time             object\n",
       "ImageQuality     object\n",
       "DeleteFlag         bool\n",
       "County           object\n",
       "Survey           object\n",
       "Analyst          object\n",
       "Notes            object\n",
       "Publicity          bool\n",
       "Empty              bool\n",
       "Person             bool\n",
       "Animal             bool\n",
       "Species          object\n",
       "species2         object\n",
       "species3        float64\n",
       "Unnamed: 18     float64\n",
       "dtype: object"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(timelapse_df)\n",
    "timelapse_df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# errors='ignore' keeps this cell safe to re-run after the column has already been dropped\n",
    "timelapse_df = timelapse_df.drop(columns='Unnamed: 18', errors='ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "timelapse_df.sample(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "602438it [01:13, 8144.85it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "14"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Collect the file names of label rows that are missing from image_name_to_path.\n",
    "# Iterating over the File column directly avoids building a Series per row with iterrows().\n",
    "entries_im_not_stored = [\n",
    "    fn for fn in tqdm(timelapse_df['File']) if fn not in image_name_to_path\n",
    "]\n",
    "\n",
    "len(entries_im_not_stored)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "602438it [02:36, 3860.95it/s]\n"
     ]
    }
   ],
   "source": [
    "embedded = []  # list of images with all attributes at the image-level\n",
    "unidentified_animal = []  # rows flagged Animal but with no species filled in\n",
    "person_and_empty = []  # rows flagged both Person and Empty, kept for manual review\n",
    "\n",
    "# passing total lets tqdm show a percentage and ETA, since iterrows() has no length\n",
    "for i_row, row in tqdm(timelapse_df.iterrows(), total=len(timelapse_df)):\n",
    "    fn = row['File']\n",
    "    path = image_name_to_path.get(fn)\n",
    "    if path is None:\n",
    "        continue  # file name not present in image_name_to_path\n",
    "\n",
    "    # using the file name only to determine the seq_id, frame_num and location\n",
    "    seq_id = '-'.join(fn.split('-')[:-1])\n",
    "    frame_num = int(fn.split('(')[1].split(')')[0])\n",
    "    location = fn.split('__')[1].split('_')[0]\n",
    "\n",
    "    # other attributes from the csv columns\n",
    "    date_time = row['Date'] + ' ' + row['Time']\n",
    "\n",
    "    is_empty = row['Empty']\n",
    "    has_person = row['Person']\n",
    "    has_animal = row['Animal']\n",
    "\n",
    "    raw_classes = []\n",
    "    # pd.isnull() also covers None, so only the literal string 'none' needs a separate check\n",
    "    if not pd.isnull(row['Species']) and row['Species'].lower() != 'none':\n",
    "        raw_classes.append(row['Species'])\n",
    "    if not pd.isnull(row['species2']):\n",
    "        raw_classes.append(row['species2'])\n",
    "    if not pd.isnull(row['species3']):\n",
    "        raw_classes.append(row['species3'])\n",
    "\n",
    "    animal_classes = [a.lower().replace('_', '') for a in raw_classes]  # _skunk to skunk\n",
    "\n",
    "    # this happens - the one sample I looked at had a person, so 'human' is appended below.\n",
    "    # Record the row for review instead of breaking out of the loop, which would silently\n",
    "    # leave every remaining row out of `embedded`.\n",
    "    if has_person and is_empty:\n",
    "        person_and_empty.append(row)\n",
    "\n",
    "    # has_animal together with is_empty also happens; be conservative and note the image as\n",
    "    # unidentified when no species was given\n",
    "    if has_animal and len(animal_classes) == 0:\n",
    "        unidentified_animal.append(row)\n",
    "        animal_classes.append('unidentified')\n",
    "\n",
    "    if has_person:\n",
    "        animal_classes.append('human')\n",
    "\n",
    "    if len(animal_classes) == 0:\n",
    "        animal_classes = ['empty']\n",
    "    else:\n",
    "        # de-duplicate; sorted so the class order is identical across runs\n",
    "        animal_classes = sorted(set(animal_classes))\n",
    "\n",
    "    embedded.append({\n",
    "        'file': path,\n",
    "        'seq_id': seq_id,\n",
    "        'frame_num': frame_num,\n",
    "        'location': location,\n",
    "        'datetime': date_time,\n",
    "        'class': animal_classes\n",
    "    })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The dataset_name is set to sulross_2019_spring. Please make sure this is correct!\n",
      "Making a deep copy of docs...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 25%|██▍       | 148172/602424 [00:00<00:00, 740771.52it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Putting 602424 images into sequences...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 602424/602424 [00:01<00:00, 484425.32it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of sequences: 288158\n",
      "Checking the location field...\n",
      "Checking which fields in a CCT image entry are sequence-level...\n",
      "\n",
      "all_img_properties\n",
      "{'frame_num', 'class', 'file', 'location', 'datetime'}\n",
      "\n",
      "img_level_properties\n",
      "{'file', 'class', 'datetime', 'frame_num'}\n",
      "\n",
      "image-level properties that really should be sequence-level\n",
      "{'location'}\n",
      "\n",
      "Finished processing sequences.\n",
      "Example sequence items:\n",
      "\n",
      "{'seq_id': 'Spring2019__S1_BadTimes__1920-08-13__07-28', 'dataset': 'sulross_2019_spring', 'images': [{'file': 'S1_BadTimes/Spring2019__S1_BadTimes__1920-08-13__07-28-46(1).JPG', 'frame_num': 1, 'datetime': '13-Aug-1920 07:28:46', 'class': ['empty']}], 'location': 'S1'}\n",
      "\n",
      "[{'seq_id': 'Spring2019__S2__2019-03-29__12-02', 'dataset': 'sulross_2019_spring', 'images': [{'file': 'S2/Spring2019__S2__2019-03-29__12-02-11(1).JPG', 'frame_num': 1, 'datetime': '29-Mar-2019 12:02:11', 'class': ['empty']}, {'file': 'S2/Spring2019__S2__2019-03-29__12-02-22(2).JPG', 'frame_num': 2, 'datetime': '29-Mar-2019 12:02:22', 'class': ['empty']}], 'location': 'S2'}]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sequences = process_sequences(embedded, dataset_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Where a sequence repeats a frame number, collapse its image entries so that each file\n",
    "# appears once (the last entry for a given file is the one kept).\n",
    "for seq in sequences:\n",
    "    frame_nums = [entry['frame_num'] for entry in seq['images']]\n",
    "    if len(set(frame_nums)) == len(frame_nums):\n",
    "        continue\n",
    "    seq['images'] = list({entry['file']: entry for entry in seq['images']}.values())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# some entries have non-unique frame numbers - the image entries are duplicated in these\n",
    "problem_seqs = [\n",
    "    seq for seq in sequences\n",
    "    if len({im['frame_num'] for im in seq['images']}) != len(seq['images'])\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "90"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(problem_seqs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "These sequences all appear to be labeled \"empty\" and have frame numbers (the `(x)` suffix in the file name) that are duplicated within the sequence. Exclude them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Keep only the sequences in which every image has a distinct frame number\n",
    "new_sequences = [\n",
    "    seq for seq in sequences\n",
    "    if len({im['frame_num'] for im in seq['images']}) == len(seq['images'])\n",
    "]\n",
    "\n",
    "sequences = new_sequences"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2 - Pass the schema check\n",
    "\n",
    "Once your metadata are in the MegaDB format for `sequence` items, we check that they conform to the format's schema.\n",
    "\n",
    "If the format conforms, the following messages will be printed:\n",
    "\n",
    "```\n",
    "Verified that the sequence items meet requirements not captured by the schema.\n",
    "Verified that the sequence items conform to the schema.\n",
    "```\n",
    "\n",
    "For large datasets, the second step will take some time (~ a minute). \n",
    "\n",
    "Otherwise there will be an error message describing what's wrong. Please fix the issues until all checks are passed. You might need to write some snippets of code to loop through the `sequence` items to understand which entries have problems."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Verified that the sequence items meet requirements not captured by the schema.\n",
      "Verified that the sequence items conform to the schema.\n"
     ]
    }
   ],
   "source": [
    "sequences_schema_check.sequences_schema_check(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sample some sequences to make sure things look good\n",
    "# (capped at len(sequences), because random.sample raises ValueError for a larger sample size)\n",
    "\n",
    "sample(sequences, min(10, len(sequences)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gather every class label that occurs on an image whose first label is not 'empty'\n",
    "species_present = {\n",
    "    label\n",
    "    for seq in sequences\n",
    "    for im in seq['images']\n",
    "    if im['class'][0] != 'empty'\n",
    "    for label in im['class']\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "species_present"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 4 - Save the `sequence` items to a file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(path_to_output, 'w') as f:\n",
    "    json.dump(sequences, f)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:cameratraps] *",
   "language": "python",
   "name": "conda-env-cameratraps-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
